<!DOCTYPE html>












  




<html class="theme-next gemini use-motion" lang="zh-CN">
<head>
  <!-- hexo-inject:begin --><!-- hexo-inject:end --><meta charset="UTF-8"/>
<meta name="google-site-verification" content="o9IkI77-fxkhBZW-n0ww9JALMCqdDbeTgdcXO_Bw4Zc" />
<meta name="baidu-site-verification" content="3frqY9KiVO" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2"/>
<meta name="theme-color" content="#222">



  
  
  <link rel="stylesheet" href="/lib/needsharebutton/needsharebutton.css">










<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="Cache-Control" content="no-siteapp" />



















  
  
  
  

  
    
    
  

  
    
      
    

    
  

  

  
    
      
    

    
  

  
    
      
    

    
  

  
    
    
    <!-- Web fonts (Monda, Roboto Slab, Lobster Two, PT Mono). Uses an explicit https URL instead of a protocol-relative one, encodes the spaces in family names as "+", and escapes "&" as "&amp;" so the href is a valid URL inside valid HTML. -->
    <link href="https://fonts.googleapis.com/css?family=Monda:300,300italic,400,400italic,700,700italic|Roboto+Slab:300,300italic,400,400italic,700,700italic|Lobster+Two:300,300italic,400,400italic,700,700italic|PT+Mono:300,300italic,400,400italic,700,700italic&amp;subset=latin,latin-ext" rel="stylesheet">
  






<link href="/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" type="text/css" />

<link href="/css/main.css?v=6.4.1" rel="stylesheet" type="text/css" />


  <link rel="apple-touch-icon" sizes="180x180" href="/images/logo.png?v=6.4.1">


  <link rel="icon" type="image/png" sizes="32x32" href="/images/logo.png?v=6.4.1">


  <link rel="icon" type="image/png" sizes="16x16" href="/images/logo.png?v=6.4.1">


  <link rel="mask-icon" href="/images/logo.svg?v=6.4.1" color="#222">









<script type="text/javascript" id="hexo.configurations">
  // Defines two globals for client-side scripts: NexT and CONFIG.
  // NexT reuses window.NexT when it already exists; otherwise it starts as an empty object.
  var NexT = window.NexT || {};
  // CONFIG is a plain settings object. NOTE(review): presumably generated from the
  // site/theme configuration at build time — confirm against the theme templates.
  var CONFIG = {
    root: '/',
    scheme: 'Gemini',
    version: '6.4.1',
    sidebar: {"position":"left","display":"post","offset":12,"b2t":false,"scrollpercent":false,"onmobile":false},
    fancybox: false,
    fastclick: false,
    lazyload: false,
    tabs: true,
    motion: {"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},
    // Algolia credentials are all empty strings on this page.
    algolia: {
      applicationID: '',
      apiKey: '',
      indexName: '',
      hits: {"per_page":10},
      // The ${...} tokens sit inside ordinary double-quoted strings, so they are literal text here, not template-literal substitutions.
      labels: {"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}
    }
  };
</script>


  




  <meta name="description" content="摘要：大数据技术与我们日常生活越来越紧密，要做大数据，首要解决数据问题。原始数据存在大量不完整、不一致、有异常的数据，严重影响到数据建模的执行效率，甚至可能导致模型结果的偏差，因此要数据预处。数据预处理主要是将原始数据经过文本抽取、数据清理、数据集成、数据处理、数据变换、数据降维等处理后，不仅提高了数据质量，而且更好的提升算法模型性能。数据预处理在数据挖掘、自然语言处理、机器学习、深度学习算法中">
<meta name="keywords" content="课程导学">
<meta property="og:type" content="article">
<meta property="og:title" content="Python数据预处理：机器学习、人工智能通用技术（1）">
<meta property="og:url" content="https://bainingchao.github.io/2018/12/24/Python数据预处理：机器学习、人工智能通用技术（1）/index.html">
<meta property="og:site_name" content="白宁超的官网">
<meta property="og:description" content="摘要：大数据技术与我们日常生活越来越紧密，要做大数据，首要解决数据问题。原始数据存在大量不完整、不一致、有异常的数据，严重影响到数据建模的执行效率，甚至可能导致模型结果的偏差，因此要数据预处。数据预处理主要是将原始数据经过文本抽取、数据清理、数据集成、数据处理、数据变换、数据降维等处理后，不仅提高了数据质量，而且更好的提升算法模型性能。数据预处理在数据挖掘、自然语言处理、机器学习、深度学习算法中">
<meta property="og:locale" content="zh-CN">
<meta property="og:image" content="https://i.imgur.com/SDWsDew.png">
<meta property="og:image" content="https://i.imgur.com/267NwEe.png">
<meta property="og:image" content="https://i.imgur.com/NpPy5tv.png">
<meta property="og:image" content="https://i.imgur.com/y29tD5L.png">
<meta property="og:image" content="https://i.imgur.com/BOWGJ3s.png">
<meta property="og:image" content="https://i.imgur.com/FONPEnW.png">
<meta property="og:image" content="https://i.imgur.com/xC3xztL.png">
<meta property="og:image" content="https://i.imgur.com/h5ANCh0.png">
<meta property="og:image" content="http://pub.idqqimg.com/wpa/images/group.png">
<meta property="og:image" content="https://i.imgur.com/NEXhm2W.png">
<meta property="og:updated_time" content="2019-03-06T08:23:24.885Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="Python数据预处理：机器学习、人工智能通用技术（1）">
<meta name="twitter:description" content="摘要：大数据技术与我们日常生活越来越紧密，要做大数据，首要解决数据问题。原始数据存在大量不完整、不一致、有异常的数据，严重影响到数据建模的执行效率，甚至可能导致模型结果的偏差，因此要数据预处。数据预处理主要是将原始数据经过文本抽取、数据清理、数据集成、数据处理、数据变换、数据降维等处理后，不仅提高了数据质量，而且更好的提升算法模型性能。数据预处理在数据挖掘、自然语言处理、机器学习、深度学习算法中">
<meta name="twitter:image" content="https://i.imgur.com/SDWsDew.png">



  <link rel="alternate" href="/atom.xml" title="白宁超的官网" type="application/atom+xml" />




  <link rel="canonical" href="https://bainingchao.github.io/2018/12/24/Python数据预处理：机器学习、人工智能通用技术（1）/"/>



<script type="text/javascript" id="page.configurations">
  // Per-page settings attached to the global CONFIG object; on this page `sidebar` is the empty string.
  CONFIG.page = { sidebar: "" };
</script>

  <title>Python数据预处理：机器学习、人工智能通用技术（1） | 白宁超的官网</title>
  









  <noscript>
  <style type="text/css">
    /*
     * No-JavaScript fallback (this stylesheet sits inside <noscript>):
     * reset opacity and offsets on the motion-animated elements to their
     * initial values.
     * NOTE(review): presumably main.css hides or offsets these elements until
     * the motion script runs - confirm against the theme stylesheet.
     */
    .use-motion .motion-element,
    .use-motion .brand,
    .use-motion .menu-item,
    .sidebar-inner,
    .use-motion .post-block,
    .use-motion .pagination,
    .use-motion .comments,
    .use-motion .post-header,
    .use-motion .post-body,
    .use-motion .collection-title { opacity: initial; }

    .use-motion .logo,
    .use-motion .site-title,
    .use-motion .site-subtitle {
      opacity: initial;
      top: initial;
    }

    /*
     * Written as flat descendant selectors. The previous preprocessor-style
     * nesting (.use-motion { .logo-line-before i { ... } }) is dropped as
     * invalid by browsers that do not implement CSS Nesting.
     */
    .use-motion .logo-line-before i { left: initial; }
    .use-motion .logo-line-after i { right: initial; }
  </style>
</noscript><!-- hexo-inject:begin --><!-- hexo-inject:end -->

</head>

<body itemscope itemtype="http://schema.org/WebPage" lang="zh-CN">

  
  
    
  

  <!-- hexo-inject:begin --><!-- hexo-inject:end --><div class="container sidebar-position-left page-post-detail">
    <div class="headband"></div>

	<!-- <a href="https://github.com/bainingchao"><img style="position: absolute; top: 0; right: 0; border: 0;" src="https://s3.amazonaws.com/github/ribbons/forkme_right_red_aa0000.png" alt="Fork me on GitHub"></a> !-->
	
    <header id="header" class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-wrapper">
  <div class="site-meta ">
    

    <div class="custom-logo-site-title">
      <a href="/" class="brand" rel="start">
        <span class="logo-line-before"><i></i></span>
        <span class="site-title">白宁超的官网</span>
        <span class="logo-line-after"><i></i></span>
      </a>
    </div>
    
      
        <h1 class="site-subtitle" itemprop="description">专注人工智能领域研究</h1>
      
    
  </div>

  <div class="site-nav-toggle">
    <button aria-label="切换导航栏">
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
    </button>
  </div>
</div>



<nav class="site-nav">
  
    <ul id="menu" class="menu">
      
        
        
        
          
          <li class="menu-item menu-item-首页">
    <a href="/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-home"></i> <br />首页</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-标签">
    <a href="/tags/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-tags"></i> <br />标签</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-分类">
    <a href="/categories/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-th"></i> <br />分类</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-归档">
    <a href="/archives/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-archive"></i> <br />归档</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-视频">
    <a href="/videos/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-sitemap"></i> <br />视频</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-书籍">
    <a href="/books/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-th"></i> <br />书籍</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-链接">
    <a href="/links/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-question-circle"></i> <br />链接</a>
  </li>
        
        
        
          
          <li class="menu-item menu-item-关于">
    <a href="/about/" rel="section">
      <i class="menu-item-icon fa fa-fw fa-user"></i> <br />关于</a>
  </li>

      
      
        <li class="menu-item menu-item-search">
          
            <a href="javascript:;" class="popup-trigger">
          
            
              <i class="menu-item-icon fa fa-search fa-fw"></i> <br />搜索</a>
        </li>
      
    </ul>
  

  

  
    <div class="site-search">
      
  <div class="popup search-popup local-search-popup">
  <div class="local-search-header clearfix">
    <span class="search-icon">
      <i class="fa fa-search"></i>
    </span>
    <span class="popup-btn-close">
      <i class="fa fa-times-circle"></i>
    </span>
    <div class="local-search-input-wrapper">
      <!-- Local-search text box. aria-label supplies an accessible name, because a placeholder alone is not a label for assistive technology. -->
      <input autocomplete="off"
             placeholder="搜索..." spellcheck="false"
             type="text" id="local-search-input"
             aria-label="搜索">
    </div>
  </div>
  <div id="local-search-result"></div>
</div>



    </div>
  
</nav>



  



</div>
    </header>

    


    <main id="main" class="main">
      <div class="main-inner">
        <div class="content-wrap">
          
            

          
          <div id="content" class="content">
            

  <div id="posts" class="posts-expand">
    

  

  
  
  

  

  <article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
  
  
  
  <div class="post-block">
    <link itemprop="mainEntityOfPage" href="https://bainingchao.github.io/2018/12/24/Python数据预处理：机器学习、人工智能通用技术（1）/">

    <!-- Hidden schema.org Person microdata for the post author. The description is plain text with "&" escaped: raw HTML containing double quotes inside content="…" ended the attribute early, closed this hidden wrapper prematurely and leaked stray markup into the visible page. -->
    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="name" content="白宁超">
      <meta itemprop="description" content="本站主要研究深度学习、机器学习、自然语言处理等前沿技术。ML&amp;NLP交流群：436303759">
      <meta itemprop="image" content="/../images/header.png">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="白宁超的官网">
    </span>

    
      <header class="post-header">

        
        
          <h2 class="post-title" itemprop="name headline">Python数据预处理：机器学习、人工智能通用技术（1）
              
            
          </h2>
        

        <div class="post-meta">
          <span class="post-time">

            
            
            

            
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              
                <span class="post-meta-item-text">发表于</span>
              

              
                
              

              <time title="创建时间：2018-12-24 16:40:05" itemprop="dateCreated datePublished" datetime="2018-12-24T16:40:05+08:00">2018-12-24</time>
            

            
              

              
                
                <span class="post-meta-divider">|</span>
                

                <span class="post-meta-item-icon">
                  <i class="fa fa-calendar-check-o"></i>
                </span>
                
                  <span class="post-meta-item-text">更新于</span>
                
                <time title="修改时间：2019-03-06 16:23:24" itemprop="dateModified" datetime="2019-03-06T16:23:24+08:00">2019-03-06</time>
              
            
          </span>

          
            <span class="post-category" >
            
              <span class="post-meta-divider">|</span>
            
              <span class="post-meta-item-icon">
                <i class="fa fa-folder-o"></i>
              </span>
              
                <span class="post-meta-item-text">分类于</span>
              
              
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing"><a href="/categories/数据预处理/" itemprop="url" rel="index"><span itemprop="name">数据预处理</span></a></span>

                
                
              
            </span>
          

          
            
          

          
          

          
            <span class="post-meta-divider">|</span>
            <span class="post-meta-item-icon"
            >
            <i class="fa fa-eye"></i>
             阅读次数： 
            <span class="busuanzi-value" id="busuanzi_value_page_pv" ></span>
            </span>
          
		  

          

          

        </div>
      </header>
    

    
    
    
    <div class="post-body" itemprop="articleBody">

      
      

      
        <blockquote>
<p>摘要：大数据技术与我们日常生活越来越紧密，要做大数据，首要解决数据问题。原始数据存在大量不完整、不一致、有异常的数据，严重影响到数据建模的执行效率，甚至可能导致模型结果的偏差，因此要进行数据预处理。数据预处理主要是将原始数据经过文本抽取、数据清理、数据集成、数据处理、数据变换、数据降维等处理后，不仅提高了数据质量，而且能更好地提升算法模型性能。数据预处理在数据挖掘、自然语言处理、机器学习、深度学习算法中起着重要的作用。（本文原创，转载必须注明出处。）</p>
</blockquote>
<a id="more"></a>
<h2 id="什么是数据预处理"><a href="#什么是数据预处理" class="headerlink" title="什么是数据预处理"></a>什么是数据预处理</h2><p>数据预处理简而言之就是将<code>原始数据</code>装进一个<code>预处理的黑匣子</code>之后，产生出<code>高质量数据</code>用来适应相关技术或者算法模型。为了大家更明确的了解数据预处理，我们举个新闻分类的例子：</p>
<ul>
<li>将原始的数据直接进行分类模型训练，分类器准确率和召回率都比较低。因为我们原始数据存在很多干扰项，比如<code>的</code>,<code>是</code>等这些所谓停用词特征对分类起的作用不大，很难达到工程应用。</li>
<li>我们将原始数据放进预处理黑匣子后，会自动过滤掉干扰数据，并且还会按照规约的方法体现每个词特征的重要性，然后将词特征压缩变换在数值型矩阵中，再通过分类器就会取得不错的效果，可以进行工程应用。</li>
</ul>
<p>总结：数据预处理前的数据存在不完整、偏态、噪声、特征比重、特征维度、缺失值、错误值等问题；数据预处理后的数据存在完整、正态、干净、特征比重合适、特征维度合理、无缺失值等优点。</p>
<p>数据预处理方法：</p>
<ul>
<li>数据清理：通过填写缺失的值、光滑噪声数据、识别或删除离群点并解决不一致性来清理数据。主要目标：格式标准化，异常数据清除，错误纠正，重复数据的清除。</li>
<li>数据集成：将数据由多个数据源合并成一个一致的数据存储，如数据仓库。 </li>
<li>数据变换：通过平滑聚集，数据概化，规范化等方式将数据转换成适用于数据挖掘的形式。如把数据压缩到0.0-1.0区间。</li>
<li>数据归约：往往数据量非常大，在海量数据上进行挖掘分析需要很长的时间，数据归约技术可以用来得到数据集的归约表示，它小得多，但仍然接近于保持原数据的完整性，并且分析结果与归约前结果相同或几乎相同。可以通过如聚集、删除冗余特征或聚类来降低数据的规模。 </li>
</ul>
<h2 id="为什么做这门课程"><a href="#为什么做这门课程" class="headerlink" title="为什么做这门课程"></a>为什么做这门课程</h2><p>在初期学习阶段，大家精力着重于算法模型和调参上。实际情况是，有时候在算法改进上花费很多功夫，却不如在数据质量上的些许提高来的明显。另外，习惯于数据语料的拿来主义之后，当面对新的任务时候，却不知道如何下手？有的同学在处理英语时候游刃有余，面对中文数据预处理却不知所措。基于以上几个问题，结合作者工程经验，整理出了‘数据预处理’学习资料，本教程主要面对文本信息处理，在图片语音等数据语料处理上是有所区别的。</p>
<h2 id="本课程能学到什么"><a href="#本课程能学到什么" class="headerlink" title="本课程能学到什么"></a>本课程能学到什么</h2><p><img src="https://i.imgur.com/SDWsDew.png" alt=""></p>
<ul>
<li>文本批量抽取：涉及技术点包括pywin32插件安装使用、文档文本提取、PDF文本提取、文本抽取器的封装、方法参数的使用、遍历文件夹、编码问题、批量抽取文本信息。</li>
<li>数据清洗：包括yield生成器、高效读取文件、正则表达式的使用、清洗网页数据、清洗字符串、中文的繁简互相转换、缺失值的处理、噪声数据、异常数据清洗、批量清洗30万条新闻数据。</li>
<li>数据处理：包括结巴分词精讲、HanLP精讲、停用词的处理、NLTK的安装使用、高频词和低频词的处理、词性的选择、特征数据的提取、批量预处理30万条新闻数据。</li>
<li>数据向量化：包括词袋模型、词集模型、词向量的转化、缺失值和数据均衡、语料库技术、TFIDF、特征词比重、主成分分析、主题模型等、批量进行30万条数据向量化。</li>
<li>可视化技术：包括条形图、柱形图、散点图、饼图、热力图等，还有matplotlib、seaborn、Axes3D综合使用进行三维可视化。</li>
<li>XGBoost竞赛神器：包括监督学习、文本分类、XGBoost原理、XGBoost算法实现、XGBoost调参、算法性能评估、30万条文档生成词典、30万条文档转化TFIDF、30万条文档转化生成LSI、训练分类器模型、抽样改进模型算法、特征维度改进模型算法、XGBoost实现30万条新闻数据文本分类</li>
</ul>
<p>综上所述：数据预处理整体包括数据抽取—&gt;数据清洗—&gt;数据处理—&gt;数据向量化—&gt;可视化分析—&gt;模型构建。在整个过程中，我们每个章节相关性很强，首先对整个章节最终实现效果进行演示，然后拆分知识点分别讲解，最后将所有知识点整合起来做小节的实战。每个小节实战数据为下一个章节做铺垫，最后，一个综合实战分类案例串联所有知识点。</p>
<h2 id="开发环境说明"><a href="#开发环境说明" class="headerlink" title="开发环境说明"></a>开发环境说明</h2><ul>
<li>开发语言:  Python3.5.3</li>
<li>系统环境：Windows 10操作系统</li>
<li>编程环境：Sublime</li>
<li>软件环境：Anaconda4.4.0</li>
<li>插件版本：均支持最新版本</li>
</ul>
<blockquote>
<p>sublime激活</p>
</blockquote>
<p>打开Help &gt;Enter  LICENSE</p>
<pre>
----- BEGIN LICENSE -----
sgbteam
Single User License
EA7E-1153259
8891CBB9 F1513E4F 1A3405C1 A865D53F
115F202E 7B91AB2D 0D2A40ED 352B269B
76E84F0B CD69BFC7 59F2DFEF E267328F
215652A3 E88F9D8F 4C38E3BA 5B2DAAE4
969624E7 DC9CD4D5 717FB40C 1B9738CF
20B3C4F1 E917B5B3 87C38D9C ACCE7DD8
5F7EF854 86B9743C FADC04AA FB0DA5C0
F913BE58 42FEA319 F954EFDD AE881E0B
------ END LICENSE ------
</pre>

<blockquote>
<p>解决Package Control报错</p>
</blockquote>
<p>[Package Control.sublime-settings]修改方法：Preferences &gt; Package Settings &gt; Package Control &gt; Settings - User<br>添加：</p>
<p><pre>
"channels":
    [
        "http://cst.stu.126.net/u/json/cms/channel_v3.json",
        //"https://packagecontrol.io/channel_v3.json",
        //"https://web.archive.org/web/20160103232808/https://packagecontrol.io/channel_v3.json",
        //"https://gist.githubusercontent.com/nick1m/660ed046a096dae0b0ab/raw/e6e9e23a0bb48b44537f61025fbc359f8d586eb4/channel_v3.json"
    ]
</pre></p>
<h2 id="项目演示"><a href="#项目演示" class="headerlink" title="项目演示"></a>项目演示</h2><p><strong>原始数据</strong></p>
<p><img src="https://i.imgur.com/267NwEe.png" alt=""></p>
<p><strong>数据预览</strong></p>
<p><img src="https://i.imgur.com/NpPy5tv.png" alt=""></p>
<p><strong>数据清洗</strong><br><img src="https://i.imgur.com/y29tD5L.png" alt=""></p>
<p><strong>生成词典</strong></p>
<p><img src="https://i.imgur.com/BOWGJ3s.png" alt=""></p>
<p><strong>生成特征向量</strong><br><img src="https://i.imgur.com/FONPEnW.png" alt=""></p>
<p><strong>生成LSI</strong><br><img src="https://i.imgur.com/xC3xztL.png" alt=""></p>
<p><strong>XGBoost新闻数据文本分类</strong><br><img src="https://i.imgur.com/h5ANCh0.png" alt=""></p>
<h2 id="目录列表"><a href="#目录列表" class="headerlink" title="目录列表"></a>目录列表</h2><p>☆ 理论介绍<br>★ 实战演练</p>
<h3 id="第1章-课程介绍"><a href="#第1章-课程介绍" class="headerlink" title="第1章 课程介绍"></a>第1章 课程介绍</h3><blockquote>
<p>本章介绍课程概要与学习导读</p>
</blockquote>
<ul>
<li>1-1 为什么做这门课—☆</li>
<li>1-2 课程整体介绍与导学—☆☆</li>
<li>1-3 学习建议—☆☆</li>
<li>1-4 课程开发环境介绍—☆</li>
<li>1-5 文本分类项目演示—☆</li>
<li>1-6 源码获取说明—☆☆☆</li>
<li>1-7 总结与扩展—☆</li>
</ul>
<h3 id="第2章-Python数据预处理之抽取文本信息"><a href="#第2章-Python数据预处理之抽取文本信息" class="headerlink" title="第2章 Python数据预处理之抽取文本信息"></a>第2章 Python数据预处理之抽取文本信息</h3><blockquote>
<p>本章介绍常见数据类型，数据采集，文本提取面临的瓶颈，打造自己的文本批量抽取工具。</p>
</blockquote>
<ul>
<li>2.1 数据类型与采集方法—☆☆☆</li>
<li>2.2 一堆杂乱无章的数据—☆</li>
<li>2.3 文本抽取问题（3种方法对比）—☆</li>
<li>2.4 Pywin32实现格式转换—☆☆</li>
<li>2.5 Word转换TXT算法—★</li>
<li>2.6 PDF转换TXT算法—★</li>
<li>2.7 文本抽取工具—★★</li>
<li>2.8 文本批量编码—★</li>
<li>2.9 遍历读取文件—★★★</li>
<li>2.10 实战案例1：遍历文件批量抽取新闻文本内容—★★★</li>
<li>2.11 总结与扩展—☆☆</li>
</ul>
<h3 id="第3章-Python数据预处理之清洗文本信息"><a href="#第3章-Python数据预处理之清洗文本信息" class="headerlink" title="第3章 Python数据预处理之清洗文本信息"></a>第3章 Python数据预处理之清洗文本信息</h3><blockquote>
<p>本章介绍数据准备，高效读取文件，网络数据、文本数据清洗工作。</p>
</blockquote>
<ul>
<li>3.1 准备30万条新闻数据—☆</li>
<li>3.2 yield生成器—★</li>
<li>3.3 高效读取文件—★★</li>
<li>3.4 数据缺失值—★★</li>
<li>3.5 脏数据与噪声数据—★★</li>
<li>3.6 正则清洗数据—★★</li>
<li>3.7 清洗HTML数据—★★</li>
<li>3.8 简繁字体转换—★★</li>
<li>3.9 实战案例2：30万条新闻文本数据清洗—★★★</li>
<li>3.10 总结与扩展—☆☆</li>
</ul>
<h3 id="第4章-Python数据预处理之文本处理"><a href="#第4章-Python数据预处理之文本处理" class="headerlink" title="第4章 Python数据预处理之文本处理"></a>第4章 Python数据预处理之文本处理</h3><blockquote>
<p>本章介绍常见分词工具、jieba分词的核心操作、自定义规则提取特征词等方法处理文本数据。</p>
</blockquote>
<ul>
<li>4.1 常见分词工具—☆</li>
<li>4.2 jieba分词（推荐）—★★★</li>
<li>4.3 HanLP分词（扩展）—★★</li>
<li>4.4 自定义去停词—★★</li>
<li>4.5 词频统计—★★</li>
<li>4.6 自定义去高低词频—★★</li>
<li>4.7 自定义规则提取特征词—★★</li>
<li>4.8 实战案例3：6万条新闻文本处理—★★★</li>
<li>4.9 总结与扩展—☆☆</li>
</ul>
<h3 id="第5章-Python数据预处理之文本特征向量化"><a href="#第5章-Python数据预处理之文本特征向量化" class="headerlink" title="第5章 Python数据预处理之文本特征向量化"></a>第5章 Python数据预处理之文本特征向量化</h3><blockquote>
<p>本章介绍词集模型，词袋模型，具体处理偏态数据、缺少值问题，并进行特征向量化操作。</p>
</blockquote>
<ul>
<li>5.1 解析数据文件—★★</li>
<li>5.2 词集模型—★★</li>
<li>5.3 词袋模型—★★</li>
<li>5.4 特征词转文本向量—★★★</li>
<li>5.5 不均衡数据归一化处理—★★</li>
<li>5.6 处理数据缺失值—★★</li>
<li>5.7 实战案例4：新闻文本特征向量化—★★★</li>
<li>5.8 总结与扩展—☆☆</li>
</ul>
<h3 id="第6章-Python数据预处理之gensim文本向量化"><a href="#第6章-Python数据预处理之gensim文本向量化" class="headerlink" title="第6章 Python数据预处理之gensim文本向量化"></a>第6章 Python数据预处理之gensim文本向量化</h3><blockquote>
<p>本章介绍gensim进行文本向量化操作</p>
</blockquote>
<ul>
<li>6.1 gensim介绍—☆☆</li>
<li>6.2 gensim构建语料词典—★</li>
<li>6.3 gensim统计词频特征—★★</li>
<li>6.4 gensim计算TF-IDF—★★</li>
<li>6.5 潜在语义索引—★★★★</li>
<li>6.6 生成主题模型—★★★★</li>
<li>6.7 生成随机映射—★★★★</li>
<li>6.8 分层狄利克雷过程—★★★★</li>
<li>6.9 实战案例6：gensim实现新闻文本特征向量化—★★★★</li>
<li>6.10 总结与扩展—☆☆☆</li>
</ul>
<h3 id="第7章-Python数据预处理之特征降维"><a href="#第7章-Python数据预处理之特征降维" class="headerlink" title="第7章 Python数据预处理之特征降维"></a>第7章 Python数据预处理之特征降维</h3><blockquote>
<p>本章介绍最常见的特征降维方法主成分分析PCA及其实现</p>
</blockquote>
<ul>
<li>7.1 什么是降维—☆☆</li>
<li>7.2 PCA 概述—☆☆☆</li>
<li>7.3 PCA 应用场景—☆☆</li>
<li>7.4 PCA 算法原理—★★★</li>
<li>7.5 PCA 算法实现—★★★</li>
<li>7.6 高维数据向低维数据映射—★★</li>
<li>7.7 前N个主成分特征—★★</li>
<li>7.8 实战案例5：PCA技术实现新闻文本特征降维—★★★★</li>
<li>7.9 总结与扩展—☆☆</li>
</ul>
<h3 id="第8章-数据可视化分析"><a href="#第8章-数据可视化分析" class="headerlink" title="第8章 数据可视化分析"></a>第8章 数据可视化分析</h3><blockquote>
<p>本章介绍可视化方法之一的matplotlib操作，以及相关可视化图形应用场景</p>
</blockquote>
<ul>
<li>8.1 matplotlib介绍—☆</li>
<li>8.2 matplotlib绘制折线图—★★</li>
<li>8.3 matplotlib绘制散点图—★★</li>
<li>8.4 matplotlib绘制直方图—★★</li>
<li>8.5 matplotlib绘制气温图表—★★</li>
<li>8.6 matplotlib绘制三维图—★★★</li>
<li>8.7 总结与扩展—☆</li>
</ul>
<h3 id="第9章-XGBoost实现30万条新闻数据文本分类"><a href="#第9章-XGBoost实现30万条新闻数据文本分类" class="headerlink" title="第9章 XGBoost实现30万条新闻数据文本分类"></a>第9章 XGBoost实现30万条新闻数据文本分类</h3><blockquote>
<p>本章介绍整合前面所有知识点，主要实现生成词典、生成tfidf向量、生成lsi向量、分类器参数训练、对新文本进行分类。前4个步骤可以看做是分类器的训练过程，而第五个阶段，则是使用训练得到的参数对新文本进行分类。</p>
</blockquote>
<ul>
<li>9.1 有监督学习—☆☆☆</li>
<li>9.2 文本分类方法—☆☆☆</li>
<li>9.3 XGBoost 原理—★★★★</li>
<li>9.4 XGBoost 算法实现—★★★★</li>
<li>9.5 准确率与召回率—☆</li>
<li>9.6 F度量值—☆</li>
<li>9.7 30万条文档生成词典—★★★</li>
<li>9.8 30万条文档转化TFIDF—★★★</li>
<li>9.9 30万条文档转化生成LSI—★★★★</li>
<li>9.10 训练分类器模型—★★★★</li>
<li>9.11 测试分类器模型—★★</li>
<li>9.12 抽样改进模型算法—★★</li>
<li>9.13 特征维度改进模型算法—★★</li>
<li>9.14 训练集和测试集比率改进模型算法—★★</li>
<li>9.15 综合实战：XGBoost实现30万条新闻数据文本分类—★★★★★</li>
<li>9.16 总结与扩展—★★</li>
</ul>
<h2 id="源码获取"><a href="#源码获取" class="headerlink" title="源码获取"></a>源码获取</h2><blockquote>
<p>源码请进【机器学习和自然语言QQ群：436303759】文件下载：<a target="_blank" href="http://shang.qq.com/wpa/qunwpa?idkey=ef3bbb679b06ac59b136c57ba9e7935ff9d3b10faeabde6e4efcafe523bbbf4d"><img border="0" src="http://pub.idqqimg.com/wpa/images/group.png" alt="自然语言处理和机器学习技术QQ交流" title="自然语言处理和机器学习技术交流"></a></p>
</blockquote>
<p><img src="https://i.imgur.com/NEXhm2W.png" alt=""></p>
<h2 id="作者声明"><a href="#作者声明" class="headerlink" title="作者声明"></a>作者声明</h2><blockquote>
<p>本文版权归作者所有，旨在技术交流使用。未经作者同意禁止转载，转载后需在文章页面明显位置给出原文链接，否则相关责任自行承担。</p>
</blockquote>

      
    </div>

    

    
    
    

    
      <div>
        <div id="wechat_subscriber" style="display: block; padding: 10px 0; margin: 20px auto; width: 100%; text-align: center">
    <img id="wechat_subscriber_qcode" src="/uploads/wechat.png" alt="白宁超 wechat" style="width: 200px; max-width: 100%;"/>
    <div>扫一扫关注微信公众号，机器学习和自然语言处理，订阅号datathinks！</div>
</div>

      </div>
    

    
      <div>
        <div style="padding: 10px 0; margin: 20px auto; width: 90%; text-align: center;">
  <div></div>
  <!-- Reward button: the inline handler toggles the #QR payment-code panel between display none and block. type="button" states the non-submitting intent explicitly; the former disable="enable" attribute was removed because it is not a standard HTML attribute. -->
  <button id="rewardButton" type="button" onclick="var qr = document.getElementById('QR'); if (qr.style.display === 'none') {qr.style.display='block';} else {qr.style.display='none'}">
    <span>打赏</span>
  </button>
  <div id="QR" style="display: none;">

    
      <div id="wechat" style="display: inline-block">
        <img id="wechat_qr" src="/images/wechatpay.jpg" alt="白宁超 微信支付"/>
        <p>微信支付</p>
      </div>
    

    
      <div id="alipay" style="display: inline-block">
        <img id="alipay_qr" src="/images/alipay.jpg" alt="白宁超 支付宝"/>
        <p>支付宝</p>
      </div>
    

    

  </div>
</div>

      </div>
    

    

    <footer class="post-footer">
      
        <div class="post-tags">
          
            <a href="/tags/课程导学/" rel="tag"><i class="fa fa-tag"></i> 课程导学</a>
          
        </div>
      

      
      
        <div class="post-widgets">
        

        

        
          
          <div class="social_share">
            
               <div>
                 
  <div class="bdsharebuttonbox">
    <a href="#" class="bds_tsina" data-cmd="tsina" title="分享到新浪微博"></a>
    <a href="#" class="bds_douban" data-cmd="douban" title="分享到豆瓣网"></a>
    <a href="#" class="bds_sqq" data-cmd="sqq" title="分享到QQ好友"></a>
    <a href="#" class="bds_qzone" data-cmd="qzone" title="分享到QQ空间"></a>
    <a href="#" class="bds_weixin" data-cmd="weixin" title="分享到微信"></a>
    <a href="#" class="bds_tieba" data-cmd="tieba" title="分享到百度贴吧"></a>
    <a href="#" class="bds_twi" data-cmd="twi" title="分享到Twitter"></a>
    <a href="#" class="bds_fbook" data-cmd="fbook" title="分享到Facebook"></a>
    <a href="#" class="bds_more" data-cmd="more"></a>
    <a class="bds_count" data-cmd="count"></a>
  </div>
  <script>
    // Publishes the share-widget settings as the global window._bd_share_config.
    // NOTE(review): presumably consumed by /static/api/js/share.js, which the
    // following script element loads — confirm against that library's docs.
    window._bd_share_config = {
      // Settings grouped under "common"; bdText and bdPic are empty strings here.
      "common": {
        "bdText": "",
        "bdMini": "2",
        "bdMiniList": false,
        "bdPic": ""
      },
      // Settings grouped under "share" (size and style values are passed as strings).
      "share": {
        "bdSize": "16",
        "bdStyle": "0"
      },
      // Settings grouped under "image": a list of service ids plus a label text and size.
      "image": {
        "viewList": ["tsina", "douban", "sqq", "qzone", "weixin", "twi", "fbook"],
        "viewText": "分享到：",
        "viewSize": "16"
      }
    }
  </script>

<script>
  // Creates a new script element, appends it to the first <head> element (or to
  // document.body when no head is found) and points its src at
  // /static/api/js/share.js with a cdnversion query parameter.
  // `with(document)` lets getElementsByTagName, body and createElement resolve
  // against document; the surrounding `0[...]` only evaluates the assignment as an expression.
  // cdnversion is computed from the current time divided by 36e5 ms (one hour),
  // so the query string changes once per hour.
  with(document)0[(getElementsByTagName('head')[0]||body).appendChild(createElement('script')).src='/static/api/js/share.js?cdnversion='+~(-new Date()/36e5)];
</script>

               </div>
            
            
               <div id="needsharebutton-postbottom">
                 <span class="btn">
                    <i class="fa fa-share-alt" aria-hidden="true"></i>
                 </span>
               </div>
            
          </div>
        
        </div>
      
      

      
        <div class="post-nav">
          <div class="post-nav-next post-nav-item">
            
              <a href="/2018/12/21/数据预处理之抽取文本信息（2）/" rel="next" title="Python数据预处理之抽取文本信息（2）">
                <i class="fa fa-chevron-left"></i> Python数据预处理之抽取文本信息（2）
              </a>
            
          </div>

          <span class="post-nav-divider"></span>

          <div class="post-nav-prev post-nav-item">
            
              <a href="/2019/01/21/简洁全面的Scrapy爬虫技术入门/" rel="prev" title="简洁全面的Scrapy爬虫技术入门">
                简洁全面的Scrapy爬虫技术入门 <i class="fa fa-chevron-right"></i>
              </a>
            
          </div>
        </div>
      

      
      
    </footer>
  </div>
  
  
  
  </article>


  </div>


          </div>
          

  
    <div class="comments" id="comments">
      <div id="lv-container" data-id="city" data-uid="MTAyMC8zOTc5NC8xNjMyMQ=="></div>
    </div>

  
 





        </div>
        
          
  
  <div class="sidebar-toggle">
    <div class="sidebar-toggle-line-wrap">
      <span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
    </div>
  </div>

  <aside id="sidebar" class="sidebar">
    
    <div class="sidebar-inner">

      

      
        <ul class="sidebar-nav motion-element">
          <li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
            文章目录
          </li>
          <li class="sidebar-nav-overview" data-target="site-overview-wrap">
            站点概览
          </li>
        </ul>
      

      <section class="site-overview-wrap sidebar-panel">
        <div class="site-overview">
          <!-- Sidebar author card (schema.org Person): avatar, name and site description. The external QQ group link opens in a new tab, so it carries rel="noopener noreferrer"; the bare "&" in the description text is escaped as "&amp;". -->
          <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
            
              <img class="site-author-image" itemprop="image"
                src="/../images/header.png"
                alt="白宁超" />
            
              <p class="site-author-name" itemprop="name">白宁超</p>
              <p class="site-description motion-element" itemprop="description">本站主要研究深度学习、机器学习、自然语言处理等前沿技术。ML&amp;NLP交流群：436303759 <span><a target="_blank" rel="noopener noreferrer" href="http://shang.qq.com/wpa/qunwpa?idkey=ef3bbb679b06ac59b136c57ba9e7935ff9d3b10faeabde6e4efcafe523bbbf4d"><img border="0" src="http://pub.idqqimg.com/wpa/images/group.png" alt="自然语言处理和机器学习技术QQ交流：436303759 " title="自然语言处理和机器学习技术交流"></a></span></p>
          </div>

          
            <nav class="site-state motion-element">
              
                <div class="site-state-item site-state-posts">
                
                  <a href="/archives">
                
                    <span class="site-state-item-count">65</span>
                    <span class="site-state-item-name">日志</span>
                  </a>
                </div>
              

              
                
                
                <div class="site-state-item site-state-categories">
                  <a href="/categories/index.html">
                    
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                      
                    
                    <span class="site-state-item-count">29</span>
                    <span class="site-state-item-name">分类</span>
                  </a>
                </div>
              

              
                
                
                <!-- Sidebar counter linking to the tag index page. -->
                <div class="site-state-item site-state-tags">
                  <a href="/tags/index.html">
                    <span class="site-state-item-count">119</span>
                    <span class="site-state-item-name">标签</span>
                  </a>
                </div>
              
            </nav>
          

          
            <div class="feed-link motion-element">
              <a href="/atom.xml" rel="alternate">
                <i class="fa fa-rss"></i>
                RSS
              </a>
            </div>
          

          
            <!-- Author's external profile links. Every link opens in a new tab, so
                 rel includes "noopener" to keep the opened page from reaching back
                 through window.opener; icons are decorative and hidden from
                 assistive technology because the link text already names the target. -->
            <div class="links-of-author motion-element">
              <span class="links-of-author-item">
                <a href="https://github.com/bainingchao" target="_blank" title="GitHub" rel="external nofollow noopener"><i class="fa fa-fw fa-github" aria-hidden="true"></i>GitHub</a>
              </span>
              <span class="links-of-author-item">
                <a href="https://www.google.com.hk/" target="_blank" title="Google" rel="external nofollow noopener"><i class="fa fa-fw fa-google" aria-hidden="true"></i>Google</a>
              </span>
              <span class="links-of-author-item">
                <a href="https://www.baidu.com/" target="_blank" title="百度" rel="external nofollow noopener"><i class="fa fa-fw fa-globe" aria-hidden="true"></i>百度</a>
              </span>
              <span class="links-of-author-item">
                <a href="https://weibo.com/p/1005056002073632?is_all=1" target="_blank" title="微博" rel="external nofollow noopener"><i class="fa fa-fw fa-weibo" aria-hidden="true"></i>微博</a>
              </span>
              <span class="links-of-author-item">
                <a href="http://www.cnblogs.com/baiboy/" target="_blank" title="博客园" rel="external nofollow noopener"><i class="fa fa-fw fa-globe" aria-hidden="true"></i>博客园</a>
              </span>
              <span class="links-of-author-item">
                <a href="https://mp.weixin.qq.com/s/s97I4gtEJIt5rMivWMkPkQ" target="_blank" title="微信公众号" rel="external nofollow noopener"><i class="fa fa-fw fa-weixin" aria-hidden="true"></i>微信公众号</a>
              </span>
            </div>
          

          
          

          
          

          
            
          
          

        </div>
      </section>

      
      <!--noindex-->
        <section class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
          <div class="post-toc">

            
              
            

            
              <div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-2"><a class="nav-link" href="#什么是数据预处理"><span class="nav-number">1.</span> <span class="nav-text">什么是数据预处理</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#为什么做这门课程"><span class="nav-number">2.</span> <span class="nav-text">为什么做这门课程</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#本课程能学到什么"><span class="nav-number">3.</span> <span class="nav-text">本课程能学到什么</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#开发环境说明"><span class="nav-number">4.</span> <span class="nav-text">开发环境说明</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#项目演示"><span class="nav-number">5.</span> <span class="nav-text">项目演示</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#目录列表"><span class="nav-number">6.</span> <span class="nav-text">目录列表</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#第1章-课程介绍"><span class="nav-number">6.1.</span> <span class="nav-text">第1章 课程介绍</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#第2章-Python数据预处理之抽取文本信息"><span class="nav-number">6.2.</span> <span class="nav-text">第2章 Python数据预处理之抽取文本信息</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#第3章-Python数据预处理之清洗文本信息"><span class="nav-number">6.3.</span> <span class="nav-text">第3章 Python数据预处理之清洗文本信息</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#第4章-Python数据预处理之文本处理"><span class="nav-number">6.4.</span> <span class="nav-text">第4章 Python数据预处理之文本处理</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#第5章-Python数据预处理之文本特征向量化"><span class="nav-number">6.5.</span> <span class="nav-text">第5章 Python数据预处理之文本特征向量化</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#第6章-Python数据预处理之gensim文本向量化"><span class="nav-number">6.6.</span> <span 
class="nav-text">第6章 Python数据预处理之gensim文本向量化</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#第7章-Python数据预处理之特征降维"><span class="nav-number">6.7.</span> <span class="nav-text">第7章 Python数据预处理之特征降维</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#第8章-数据可视化分析"><span class="nav-number">6.8.</span> <span class="nav-text">第8章 数据可视化分析</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#第9章-XGBoost实现30万条新闻数据文本分类"><span class="nav-number">6.9.</span> <span class="nav-text">第9章 XGBoost实现30万条新闻数据文本分类</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#源码获取"><span class="nav-number">7.</span> <span class="nav-text">源码获取</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#作者声明"><span class="nav-number">8.</span> <span class="nav-text">作者声明</span></a></li></ol></div>
            

          </div>
        </section>
      <!--/noindex-->
      

      

    </div>
  </aside>


        
      </div>
    </main>

    <footer id="footer" class="footer">
      <div class="footer-inner">
        <div class="copyright">&copy; <span itemprop="copyrightYear">2019</span>
          <span class="with-love" id="animate">
            <i class="fa fa-user"></i>
          </span>
          <span class="author" itemprop="copyrightHolder">白宁超</span>
        </div>

        <!--<div class="powered-by">由 <a class="theme-link" target="_blank" rel="external nofollow" href="https://hexo.io">Hexo</a> 强力驱动 v3.7.1</div> -->

        <!--<span class="post-meta-divider">|</span>-->

        <!--<div class="theme-info">主题 – <a class="theme-link" target="_blank" rel="external nofollow" href="https://theme-next.org">NexT.Gemini</a> v6.4.1</div>-->

        <!-- Busuanzi visit counter. The script is included exactly once and over
             HTTPS: the page previously pulled the same library from two hosts,
             which presumably reports each page view twice (verify against the
             counter) and depended on a protocol-relative URL. The script fills
             the two busuanzi_value_* spans below. -->
        <script async src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>

        <div class="busuanzi-count">
          <span class="site-uv" title="总访客量">
            <i class="fa fa-user"></i>
            <span class="busuanzi-value" id="busuanzi_value_site_uv"></span>
          </span>

          <span class="site-pv" title="总访问量">
            <i class="fa fa-eye"></i>
            <span class="busuanzi-value" id="busuanzi_value_site_pv"></span>
          </span>
        </div>
      </div>
    </footer>

    
      <div class="back-to-top">
        <i class="fa fa-arrow-up"></i>
        
      </div>
    

    
	
    

    
  </div>

  

<script type="text/javascript">
  // Null out window.Promise unless Object.prototype.toString reports it as a
  // plain function, so later scripts do not pick up a non-function value.
  (function (win) {
    var looksLikeFunction =
      Object.prototype.toString.call(win.Promise) === '[object Function]';
    if (!looksLikeFunction) {
      win.Promise = null;
    }
  })(window);
</script>


























  
  
    <script type="text/javascript" src="/lib/jquery/index.js?v=2.1.3"></script>
  

  
  
    <script type="text/javascript" src="/lib/velocity/velocity.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/lib/velocity/velocity.ui.min.js?v=1.2.1"></script>
  


  


  <script type="text/javascript" src="/js/src/utils.js?v=6.4.1"></script>

  <script type="text/javascript" src="/js/src/motion.js?v=6.4.1"></script>



  
  


  <script type="text/javascript" src="/js/src/affix.js?v=6.4.1"></script>

  <script type="text/javascript" src="/js/src/schemes/pisces.js?v=6.4.1"></script>



  
  <script type="text/javascript" src="/js/src/scrollspy.js?v=6.4.1"></script>
<script type="text/javascript" src="/js/src/post-details.js?v=6.4.1"></script>



  


  <script type="text/javascript" src="/js/src/bootstrap.js?v=6.4.1"></script>



  



  
    <script type="text/javascript">
      // Page identifier handed to the comment widget.
      window.livereOptions = {
        refer: '2018/12/24/Python数据预处理：机器学习、人工智能通用技术（1）/'
      };
      // Inject the embed script ahead of the first <script> in the document,
      // unless LivereTower is already defined as a function.
      (function(doc, tagName) {
        var firstTag = doc.getElementsByTagName(tagName)[0];
        if (typeof LivereTower === 'function') {
          return;
        }
        var loader = doc.createElement(tagName);
        loader.src = 'https://cdn-city.livere.com/js/embed.dist.js';
        loader.async = true;
        firstTag.parentNode.insertBefore(loader, firstTag);
      })(document, 'script');
    </script>
  










  

  <script type="text/javascript">
    // Shared state for the local-search popup.
    // isfetched: false until the search index has been downloaded once.
    var isfetched = false;
    // Location of the search index, relative to the site root; an empty value
    // falls back to the XML index.
    var search_path = "search.xml";
    if (search_path.length === 0) {
      search_path = "search.xml";
    }
    // The index is parsed as XML unless its file name ends in "json".
    var isXml = !/json$/i.test(search_path);
    var path = "/" + search_path;
    // monitor main search box;

    // Close the search popup: hide the dialog, clear the query and any rendered
    // results, drop the overlay and restore the body's overflow style.
    var onPopupClose = function (e) {
      $('.popup').hide();
      $('#local-search-input').val('');
      ['.search-result-list', '#no-result', ".local-search-pop-overlay"].forEach(function (selector) {
        $(selector).remove();
      });
      $('body').css('overflow', '');
    };

    // Open the search popup: add a click-to-close overlay, lock body scrolling,
    // toggle the dialog and move focus into the search input.
    function proceedsearch() {
      var overlayMarkup = '<div class="search-popup-overlay local-search-pop-overlay"></div>';
      var $page = $("body");
      $page.append(overlayMarkup);
      $page.css('overflow', 'hidden');
      $('.search-popup-overlay').on('click', onPopupClose);
      $('.popup').toggle();
      // Turn off autocapitalize/autocorrect on the input before focusing it.
      $('#local-search-input')
        .attr({ autocapitalize: "none", autocorrect: "off" })
        .focus();
    }

    /**
     * Fetch the search index and turn the popup's input into a live search box.
     *
     * A full-page loading overlay is shown while the index at `path` is being
     * downloaded (parsed as XML or JSON according to the outer `isXml` flag).
     * On success the overlay is removed, a handler that rebuilds the result
     * list is attached to the input, and the popup is opened via
     * proceedsearch(). On failure the overlay is removed so the page stays
     * usable; `isfetched` is left untouched, so the next click on the trigger
     * retries the download.
     *
     * @param {string} path        URL of the search index file.
     * @param {string} search_id   id of the search input element.
     * @param {string} content_id  id of the element that receives the results.
     */
    var searchFunc = function(path, search_id, content_id) {
      'use strict';

      // Take the loading overlay down and unlock page scrolling again.
      var removeLoadingOverlay = function () {
        $(".local-search-pop-overlay").remove();
        $('body').css('overflow', '');
      };

      // Collect every non-overlapping occurrence of `word` in `text` as
      // {position, word} records. An empty word matches nothing.
      var getIndexByWord = function (word, text, caseSensitive) {
        var wordLen = word.length;
        if (wordLen === 0) {
          return [];
        }
        var startPosition = 0, position, index = [];
        if (!caseSensitive) {
          text = text.toLowerCase();
          word = word.toLowerCase();
        }
        while ((position = text.indexOf(word, startPosition)) > -1) {
          index.push({position: position, word: word});
          startPosition = position + wordLen;
        }
        return index;
      };

      // Return text[slice.start, slice.end) as markup, with every hit of the
      // slice wrapped in a "search-keyword" element.
      var highlightKeyword = function (text, slice) {
        var result = '';
        var prevEnd = slice.start;
        slice.hits.forEach(function (hit) {
          result += text.substring(prevEnd, hit.position);
          var end = hit.position + hit.length;
          result += '<b class="search-keyword">' + text.substring(hit.position, end) + '</b>';
          prevEnd = end;
        });
        result += text.substring(prevEnd, slice.end);
        return result;
      };

      // decodeURIComponent throws on a malformed percent-escape; fall back to
      // the raw value so one bad entry cannot abort the whole search.
      var safeDecodeUrl = function (url) {
        try {
          return decodeURIComponent(url);
        } catch (err) {
          return url;
        }
      };

      // start loading animation
      $("body")
        .append('<div class="search-popup-overlay local-search-pop-overlay">' +
          '<div id="search-loading-icon">' +
          '<i class="fa fa-spinner fa-pulse fa-5x fa-fw"></i>' +
          '</div>' +
          '</div>')
        .css('overflow', 'hidden');
      $("#search-loading-icon").css('margin', '20% auto 0 auto').css('text-align', 'center');

      $.ajax({
        url: path,
        dataType: isXml ? "xml" : "json",
        async: true,
        error: function() {
          // Without this the spinner overlay would stay on top of the page and
          // body scrolling would remain disabled after a failed download.
          removeLoadingOverlay();
        },
        success: function(res) {
          // get the contents from search data
          isfetched = true;
          $('.popup').detach().appendTo('.header-inner');
          var datas = isXml ? $("entry", res).map(function() {
            return {
              title: $("title", this).text(),
              content: $("content",this).text(),
              url: $("url" , this).text()
            };
          }).get() : res;
          var input = document.getElementById(search_id);
          var resultContent = document.getElementById(content_id);

          // Re-run the search against the current input value and render the
          // outcome into the result container.
          var inputEventFunction = function() {
            var searchText = input.value.trim().toLowerCase();
            var keywords = searchText.split(/[\s\-]+/);
            if (keywords.length > 1) {
              // also look for the whole phrase, not only its individual words
              keywords.push(searchText);
            }
            var resultItems = [];
            if (searchText.length > 0) {
              // perform local searching
              datas.forEach(function(data) {
                var searchTextCount = 0;
                // Entries lacking a title or content are treated as empty text
                // instead of throwing on .trim().
                var title = (data.title || '').trim();
                // only match articles with not empty titles
                if (title === '') {
                  return;
                }
                var content = (data.content || '').trim().replace(/<[^>]+>/g,"");
                var titleInLowerCase = title.toLowerCase();
                var contentInLowerCase = content.toLowerCase();
                var articleUrl = safeDecodeUrl(data.url);
                var indexOfTitle = [];
                var indexOfContent = [];
                keywords.forEach(function(keyword) {
                  indexOfTitle = indexOfTitle.concat(getIndexByWord(keyword, titleInLowerCase, false));
                  indexOfContent = indexOfContent.concat(getIndexByWord(keyword, contentInLowerCase, false));
                });
                var hitCount = indexOfTitle.length + indexOfContent.length;
                if (hitCount === 0) {
                  return;
                }

                // Sort each index by descending position (ties: shorter word
                // first) so that pop() yields the earliest hit, and among
                // hits at the same position the longest word.
                var indexes = [indexOfTitle, indexOfContent];
                indexes.forEach(function (index) {
                  index.sort(function (itemLeft, itemRight) {
                    if (itemRight.position !== itemLeft.position) {
                      return itemRight.position - itemLeft.position;
                    } else {
                      return itemLeft.word.length - itemRight.word.length;
                    }
                  });
                });

                // Consume hits from the end of `index` that fit inside
                // [start, end) and merge them into one slice; hits overlapping
                // an already accepted hit are discarded. Must be called with a
                // non-empty index.
                var mergeIntoSlice = function (start, end, index) {
                  var item = index[index.length - 1];
                  var position = item.position;
                  var word = item.word;
                  var hits = [];
                  var searchTextCountInSlice = 0;
                  while (position + word.length <= end && index.length !== 0) {
                    if (word === searchText) {
                      searchTextCountInSlice++;
                    }
                    hits.push({position: position, length: word.length});
                    var wordEnd = position + word.length;

                    // move to next position of hit
                    index.pop();
                    while (index.length !== 0) {
                      item = index[index.length - 1];
                      position = item.position;
                      word = item.word;
                      if (wordEnd > position) {
                        index.pop();
                      } else {
                        break;
                      }
                    }
                  }
                  searchTextCount += searchTextCountInSlice;
                  return {
                    hits: hits,
                    start: start,
                    end: end,
                    searchTextCount: searchTextCountInSlice
                  };
                };

                var slicesOfTitle = [];
                if (indexOfTitle.length !== 0) {
                  slicesOfTitle.push(mergeIntoSlice(0, title.length, indexOfTitle));
                }

                var slicesOfContent = [];
                while (indexOfContent.length !== 0) {
                  var item = indexOfContent[indexOfContent.length - 1];
                  var position = item.position;
                  var word = item.word;
                  // cut out 100 characters: 20 before the hit, 80 from it on,
                  // clamped to the content and never shorter than the word
                  var start = position - 20;
                  var end = position + 80;
                  if (start < 0) {
                    start = 0;
                  }
                  if (end < position + word.length) {
                    end = position + word.length;
                  }
                  if (end > content.length) {
                    end = content.length;
                  }
                  slicesOfContent.push(mergeIntoSlice(start, end, indexOfContent));
                }

                // sort slices in content by search text's count and hits' count
                slicesOfContent.sort(function (sliceLeft, sliceRight) {
                  if (sliceLeft.searchTextCount !== sliceRight.searchTextCount) {
                    return sliceRight.searchTextCount - sliceLeft.searchTextCount;
                  } else if (sliceLeft.hits.length !== sliceRight.hits.length) {
                    return sliceRight.hits.length - sliceLeft.hits.length;
                  } else {
                    return sliceLeft.start - sliceRight.start;
                  }
                });

                // select top N slices in content
                var upperBound = parseInt('1', 10);
                if (upperBound >= 0) {
                  slicesOfContent = slicesOfContent.slice(0, upperBound);
                }

                // highlight title and content
                var resultItem = '';

                if (slicesOfTitle.length !== 0) {
                  resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + highlightKeyword(title, slicesOfTitle[0]) + "</a>";
                } else {
                  resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + title + "</a>";
                }

                slicesOfContent.forEach(function (slice) {
                  resultItem += "<a href='" + articleUrl + "'>" +
                    "<p class=\"search-result\">" + highlightKeyword(content, slice) +
                    "...</p>" + "</a>";
                });

                resultItem += "</li>";
                resultItems.push({
                  item: resultItem,
                  searchTextCount: searchTextCount,
                  hitCount: hitCount,
                  id: resultItems.length
                });
              });
            }
            if (keywords.length === 1 && keywords[0] === "") {
              // empty query: show the idle search icon
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-search fa-5x"></i></div>';
            } else if (resultItems.length === 0) {
              // nothing matched
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-frown-o fa-5x"></i></div>';
            } else {
              // rank by whole-phrase matches, then total hits, then latest entry first
              resultItems.sort(function (resultLeft, resultRight) {
                if (resultLeft.searchTextCount !== resultRight.searchTextCount) {
                  return resultRight.searchTextCount - resultLeft.searchTextCount;
                } else if (resultLeft.hitCount !== resultRight.hitCount) {
                  return resultRight.hitCount - resultLeft.hitCount;
                } else {
                  return resultRight.id - resultLeft.id;
                }
              });
              var searchResultList = '<ul class=\"search-result-list\">';
              resultItems.forEach(function (result) {
                searchResultList += result.item;
              });
              searchResultList += "</ul>";
              resultContent.innerHTML = searchResultList;
            }
          };

          // Trigger mode baked into the generated page: 'auto' searches on
          // every input event, anything else waits for the icon or Enter.
          var searchTrigger = 'auto';
          if (searchTrigger === 'auto') {
            input.addEventListener('input', inputEventFunction);
          } else {
            $('.search-icon').click(inputEventFunction);
            input.addEventListener('keypress', function (event) {
              if (event.keyCode === 13) {
                inputEventFunction();
              }
            });
          }

          // remove loading animation
          removeLoadingOverlay();

          proceedsearch();
        }
      });
    };

    // handle and trigger popup window;
    // Trigger click: download the index on first use, afterwards just reopen.
    $('.popup-trigger').on('click', function (evt) {
      evt.stopPropagation();
      if (isfetched !== false) {
        proceedsearch();
        return;
      }
      searchFunc(path, 'local-search-input', 'local-search-result');
    });

    // The close button dismisses the popup; clicks inside the popup are
    // stopped from propagating to ancestor handlers.
    $('.popup-btn-close').on('click', onPopupClose);
    $('.popup').on('click', function (evt) {
      evt.stopPropagation();
    });

    // Esc (key code 27) closes the popup while it is visible.
    $(document).on('keyup', function (evt) {
      if (evt.which !== 27) {
        return;
      }
      if (!$('.search-popup').is(':visible')) {
        return;
      }
      onPopupClose();
    });
  </script>





  

  

  
<script>
// Inject the push script ahead of the first <script> of the document, choosing
// the URL that matches the protocol the page was served over.
(function(){
    var servedOverHttps = window.location.protocol.split(':')[0] === 'https';
    var pushScript = document.createElement('script');
    pushScript.src = servedOverHttps
        ? 'https://zz.bdstatic.com/linksubmit/push.js'
        : 'http://push.zhanzhang.baidu.com/push.js';
    var firstScript = document.getElementsByTagName("script")[0];
    firstScript.parentNode.insertBefore(pushScript, firstScript);
})();
</script>


  
  

  
  

  
    
      <script type="text/x-mathjax-config">
    // Inline math is delimited by $...$ or \(...\); escapes are processed and
    // the listed tags are never scanned for math. Equations are numbered AMS-style.
    MathJax.Hub.Config({
      tex2jax: {
        inlineMath: [ ['$','$'], ["\\(","\\)"]  ],
        processEscapes: true,
        skipTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code']
      },
      TeX: {equationNumbers: { autoNumber: "AMS" }}
    });
</script>

<script type="text/x-mathjax-config">
    // Queue a callback that tags the parent of every jax's source element
    // with the "has-jax" class.
    MathJax.Hub.Queue(function() {
      var all = MathJax.Hub.getAllJax(), i;
        for (i=0; i < all.length; i += 1) {
          all[i].SourceElement().parentNode.className += ' has-jax';
        }
    });
</script>
<!-- Explicit https: a protocol-relative URL cannot resolve when the page is
     opened from file:// and would fetch the library over plain HTTP on an
     HTTP page. -->
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/mathjax@2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>

    
  


  
  
  
  <script src="/lib/needsharebutton/needsharebutton.js"></script>

  <script>
    // Options for the share button attached to #needsharebutton-postbottom.
    // pbOptions is assigned without a declaration, so it ends up as a global.
    pbOptions = {
      iconStyle: "box",
      boxForm: "horizontal",
      position: "bottomCenter",
      networks: "Weibo,Wechat,Douban,QQZone,Linkedin,Facebook"
    };
    new needShareButton('#needsharebutton-postbottom', pbOptions);
  </script>

  

  

  

  

  

  

  <!-- Click effect: little red hearts. Root-absolute path, like the other
       /js/src/ includes, so it also resolves on nested post URLs. -->
	<script type="text/javascript" src="/js/src/love.js"></script><!-- hexo-inject:begin --><!-- Begin: Injected MathJax -->
<script type="text/x-mathjax-config">
  // tex2jax: inline math is $...$ or \(...\), escapes are processed, and the
  // listed tags are skipped. TeX: AMS-style automatic equation numbering.
  MathJax.Hub.Config({
    "tex2jax": {
      "inlineMath": [["$", "$"], ["\\(", "\\)"]],
      "skipTags": ["script", "noscript", "style", "textarea", "pre", "code"],
      "processEscapes": true
    },
    "TeX": {
      "equationNumbers": {"autoNumber": "AMS"}
    }
  });
</script>

<script type="text/x-mathjax-config">
  // Queue a callback that appends the "has-jax" class to the parent of every
  // jax's source element.
  MathJax.Hub.Queue(function() {
    MathJax.Hub.getAllJax().forEach(function (jax) {
      jax.SourceElement().parentNode.className += ' has-jax';
    });
  });
</script>

<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js"></script>
<!-- End: Injected MathJax -->
<!-- hexo-inject:end -->
</body>
</html>
